library(dplyr)
library(data.table)
library(cowplot)
library(ggraph)
library(igraph)
source("utils.R")
knitr::opts_chunk$set(
fig.path = "decomposition_figs/",
fig.keep = "high",
dev = c("pdf", "png")
)
data.dir <- "../../data/"
source("load_data.R")
'measure.vars' [Abundance_e9312, Abundance_eMED4, Abundance_NATL, Abundancee_SS120, Abundancee_9313] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'character'. All measure variables not of type 'character' will be coerced to. Check DETAILS in ?melt.data.table for more on coercion.NAs introduced by coercion
Raw relative abundance time series of ecotypes at depths:
bats[, rel.abund := abundance / sum(abundance), by = depth]
bats[, log.rel.abund := log10(rel.abund)]
ggplot(bats, aes(x = month, y = log.rel.abund)) +
geom_path(aes(group = ecotype)) +
facet_grid(depth ~ ecotype) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Power spectrum of ecotypes at depths:
# spectral analysis per ecotype per depth
# mean per month if more than one sample
bats.depthlra <- bats[, .(mn = mean(log.rel.abund)),
by = .(month, depth, ecotype)]
bats.spec <- bats.depthlra[, {
s <- stats::spectrum(mn, plot = FALSE)
data.table(freq = s$freq, spec = s$spec)
}, by = .(ecotype, depth)]
# normalize so peak is 1
bats.spec[, spec := spec / max(spec), by = .(depth, ecotype)]
bats.spec[, timescale.months := 1 / freq]
ggplot(bats.spec, aes(x = timescale.months, y = spec)) +
geom_point(size = 0.5) +
geom_path(aes(group = ecotype)) +
facet_grid(depth ~ ecotype) +
geom_vline(aes(linetype = "12"), xintercept = 12, color = "blue") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Blue line indicates periodicity of 1 year. Relative height of peaks gives relative magnitude of periodic fluctuations at that time scale. This shows that not all ecotypes at all depths are most strongly driven by annual cycling. E.g. e9312 apparently experiences strong annual cycling at all depths except 100. NATL experiences annual cycling at all depths but it is relatively weaker at depths 160-200.
setkey(bats, month)
# all pairs
bats[, id := paste(month, depth, sep = "_")]
bats.ids <- unique(bats$id)
bats.div <- MakeNonRedundantPairs(bats.ids, prefix = "id")
bats.div[, c("month.i", "depth.i") := tstrsplit(id.i, "_")]
bats.div[, c("month.j", "depth.j") := tstrsplit(id.j, "_")]
# it would be nice to get distances between depths but we'll do same depth rn
# down by order of magnitude
bats.div <- bats.div[depth.i == depth.j & month.j >= month.i]
bats.div[, depth.j := NULL]
setnames(bats.div, "depth.i", "depth")
setkey(bats, id)
bats.div[, jsd := {
if (id.i == id.j) {
0
} else {
# browser()
sd <- bats[c(id.i, id.j)]
sd <- dcast(sd, ecotype ~ id, value.var = "rel.abund",
fun.aggregate = mean)
sd <- as.matrix(sd[, -1])
genJSD(sd)
}
}, by = .(id.i, id.j)]
rec <- bats.div[id.i != id.j]
setnames(rec, names(rec), sapply(names(rec), function(str) {
if (grepl("\\.i", str)) sub("\\.i", "\\.j", str)
else if (grepl("\\.j", str)) sub("\\.j", "\\.i", str)
else str
}))
bats.div <- rbind(bats.div, rec)
bats.div[, distance := sqrt(jsd)]
bats.div[, ":=" (month.i = as.numeric(month.i),
month.j = as.numeric(month.j))]
bats.div[, delta := month.j - month.i]
vertices <- unique(bats[, .(id, month, cal.month, depth)])
edges <- bats.div[delta == 1, .(id.i, id.j, depth = as.numeric(depth))]
graph <- graph_from_data_frame(edges, directed = FALSE, vertices)
bats.depths <- unique(bats$depth)
bats.depths <- bats.depths[order(as.numeric(bats.depths))]
names(bats.depths) <- bats.depths
bats.dms <- lapply(bats.depths, function(de) {
d <- bats.div[depth == de]
MakeDistMatrix(d, "id.i", "id.j")
})
fits <- lapply(bats.dms, function(dm) {
CoordCMDS(dm)
})
xforms <- lapply(fits, function(x) {
x$points
}) %>% rbindlist(use.names = TRUE, idcol = "depth")
eigranks <- lapply(fits, function(x) {
x$eigrank
}) %>% rbindlist(use.names = TRUE, idcol = "depth")
eigranks[, depth := factor(depth, levels = bats.depths)]
setkey(xforms, sample)
lo <- create_layout(graph, "manual", node.positions = xforms[V(graph)$name])
mbreaks <- c(1, 3, 6, 9, 12) # Jan, plus months of solstices + equinoxes
ggraph(lo) +
geom_edge_link(arrow = arrow(type = "closed", length = unit(3, "points")),
end_cap = square(length = 3, unit = "points"),
edge_width = 0.2) +
geom_node_point(aes(color = cal.month), size = 0.5) +
scale_color_gradientn(values = scales::rescale(mbreaks),
colors = c("blue", "green4", "yellow", "orange",
"blue")
) +
facet_wrap(~ depth, ncol = 3) +
theme_graph(base_family = "Helvetica") +
theme(aspect.ratio = 1)
This also shows that seasonal cycling is more reproducible at some depths. For depths shallower than 100, there seems to be a reproducible path, and the composition seems to change more gradually, as can be seen in the close spacing of most consecutive timepoints. For depths 120-160, there are also reproducible paths, however, there are 3 clusters of time points (seemingly representing spring, summer/early fall, and late fall/winter) with large gaps between them, and the long leaps between clusters suggests the seasonal compositional transitions are more drastic. Finally, depths 180-200 appear more random.
Eigenvalues of MDS show that this transformation captures most of the information:
ggplot(eigranks, aes(x = rank, y = value ^2)) +
geom_point() +
scale_x_log10() +
facet_wrap(~ depth, scales = "free_y")
Heatmapping the JSDs shows occupancy of stable states as dark squares of high temporal similarity along the diagonal. Recurrence of stable states show as dark off-diagonal rectangles. It is clear the time scales of stability (sizes of on-diagonal dark squares) are different for depths 1-40 and 100-160. 60-80 seem to be a hybrid of the two regimes, and 180-200 seem to go toward randomness.
bats.div[, depth := as.numeric(depth)]
ggplot(bats.div, aes(x = month.i, y = month.j)) +
geom_tile(aes(fill = jsd)) +
facet_wrap(~ depth, ncol = 3) +
theme(aspect.ratio = 1)
David-style, JSD-based ‘autocorrelation.’ The periodicity of many of the autocorrelation functions shows strong periodic dynamics at a single time scale.
ggplot(bats.div[delta > 0], aes(x = delta, y = jsd)) +
geom_point(size = 0.2) +
geom_smooth() +
facet_wrap(~ depth, ncol = 3)
hot[, rel.abund := abundance / sum(abundance), by = depth]
hot[, log.rel.abund := log10(rel.abund)]
ggplot(hot, aes(x = month, y = log.rel.abund)) +
geom_path(aes(group = ecotype)) +
facet_grid(depth ~ ecotype) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
ggplot(eigranks, aes(x = rank, y = value ^2)) +
geom_point() +
scale_x_log10() +
facet_wrap(~ depth, scales = "free_y")
Shallower depths are more stable in composition:
setkey(hot, month)
# all pairs
hot[, id := paste(month, depth, sep = "_")]
hot.ids <- unique(hot$id)
hot.div <- MakeNonRedundantPairs(hot.ids, prefix = "id")
hot.div[, c("month.i", "depth.i") := tstrsplit(id.i, "_")]
hot.div[, c("month.j", "depth.j") := tstrsplit(id.j, "_")]
# it would be nice to get distances between depths but we'll do same depth rn
# down by order of magnitude
hot.div <- hot.div[depth.i == depth.j & month.j >= month.i]
hot.div[, depth.j := NULL]
setnames(hot.div, "depth.i", "depth")
setkey(hot, id)
hot.div[, jsd := {
if (id.i == id.j) {
0
} else {
sd <- hot[c(id.i, id.j)]
sd <- dcast(sd, ecotype ~ id, value.var = "rel.abund",
fun.aggregate = mean)
sd <- as.matrix(sd[, -1])
genJSD(sd)
}
}, by = .(id.i, id.j)]
rec <- hot.div[id.i != id.j]
setnames(rec, names(rec), sapply(names(rec), function(str) {
if (grepl("\\.i", str)) sub("\\.i", "\\.j", str)
else if (grepl("\\.j", str)) sub("\\.j", "\\.i", str)
else str
}))
hot.div <- rbind(hot.div, rec)
hot.div[, distance := sqrt(jsd)]
hot.div[, ":=" (month.i = as.numeric(month.i),
month.j = as.numeric(month.j))]
hot.div[, delta := month.j - month.i]
ggplot(hot.div, aes(x = distance)) +
stat_ecdf(aes(color = factor(depth,
levels = as.character(
sort(
unique(
as.numeric(depth)
)
)
)
)
)) +
labs(color = "depth")
setkey(xforms, sample)
lo <- create_layout(graph, "manual", node.positions = xforms[V(graph)$name])
mbreaks <- c(1, 3, 6, 9, 12) # Jan, plus months of solstices + equinoxes
ggraph(lo) +
geom_edge_link(arrow = arrow(type = "closed", length = unit(3, "points")),
end_cap = square(length = 3, unit = "points"),
edge_width = 0.2) +
geom_node_point(aes(color = cal.month), size = 0.5) +
scale_color_gradientn(values = scales::rescale(mbreaks),
colors = c("blue", "green4", "yellow", "orange",
"blue")
) +
facet_wrap(~ depth, ncol = 3) +
theme_graph(base_family = "Helvetica") +
theme(aspect.ratio = 1)
There are too many OTUs. They should probably be aggregated somehow. Here we aggregate all reads that map to the same phylum, which results in a manageable number of variables. But one could just as well aggregate at any other taxonomic level or not at all.
hot.div[, depth := as.numeric(as.character(depth))]
ggplot(hot.div, aes(x = month.i, y = month.j)) +
geom_tile(aes(fill = jsd)) +
facet_wrap(~ depth, ncol = 3) +
theme(aspect.ratio = 1)
phyla %>%
group_by(phylum) %>%
mutate(freq = scales::rescale(freq)) %>%
ggplot(aes(x = as.numeric(day), y = freq)) +
theme(strip.text = element_text(size = 10),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
) +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
geom_line(aes(group = phylum)) +
facet_wrap(~ phylum, ncol = 4) +
labs(y = "frequency (normalized)")
We can look at the spectral density of the phyla time series. Phyla with peaks in the low frequencies exhibit slow fluctuations. It seems there really are slow and broad-spectrum taxa at this level.
phyla.spec <- phyla %>%
dcast(day ~ phylum, value.var = "freq") %>%
select(-day) %>%
stats::spectrum(plot = FALSE)
phyla.spec <- data.table(cbind(phyla.spec$freq, phyla.spec$spec)) %>%
setnames(c("freq", phyla.spec$snames)) %>%
melt(id.vars = "freq", variable.name = "phylum")
phyla.spec %>%
group_by(phylum) %>%
mutate(value = value / sum(value)) %>%
ggplot(aes(x = freq, y = phylum)) +
labs(x = "frequency (1/day)") +
geom_tile(aes(fill = value))
Are there qualitative coarse-grainable compositional states in the Nahant data?
phyla[, day := as.character(day)]
days <- unique(phyla[, day])
day.div <- data.table(expand.grid(day.i = days, day.j = days))
day.div[, ":=" (day.i = as.character(day.i),
day.j = as.character(day.j))]
setkey(phyla, day)
day.div[, jsd := {
if (day.i == day.j) {
0
} else {
sd <- phyla[c(day.i, day.j)]
sd <- dcast(sd, phylum ~ day, value.var = "freq")
m <- as.matrix(sd[, 2:3])
d <- genJSD(m)
d
}
}, by = .(day.i, day.j)]
day.div[, distance := sqrt(jsd)]
Layout the samples by Jensen-Shannon distance using mds algorithm. Color by day so as to see temporal progression. Hierarchically cluster by JS distance.
edges <- cbind(days[-length(days)], days[-1])
graph <- graph_from_data_frame(edges, directed = FALSE,
vertices = data.table(id = days,
day = as.numeric(days)))
dm <- MakeDistMatrix(day.div, "day.i", "day.j")
fit <- CoordCMDS(dm)
xform <- fit$points
# order
setkey(xform, sample)
xform <- xform[V(graph)$name]
layout <- create_layout(graph, "manual", node.positions = xform)
p1 <- ggraph(layout) +
geom_edge_link(arrow = arrow(type = "closed",
length = unit(5, "points")),
edge_width = 0.2,
end_cap = square(length = 5, unit = "points")
) +
geom_node_point(aes(color = day)) +
theme_graph(base_family = "Helvetica") +
scale_color_distiller(palette = "Spectral")
# clustering
hc <- hclust(dist(dm))
dendro <- as.dendrogram(hc)
dendro <- dendrapply(dendro, function(d) {
if (is.leaf(d)) {
attr(d, "nodePar") <- list(day = as.numeric(attr(d, "label")))
}
d
})
p2 <- ggraph(dendro, "dendrogram") +
geom_edge_elbow() +
# geom_node_text(aes(filter = leaf, color = day, label = day), angle = 90, size = 2) +
geom_node_point(aes(filter = leaf, color = day)) +
scale_color_distiller(palette = "Spectral") +
theme_graph(base_family = "Helvetica") #+
# theme(aspect.ratio = 1)
plot_grid(p1, p2, nrow = 2, align = "hv", labels = c("A", "B"))
These data show a clear transition between stable states around day 240. These two states form the two major branches at the lowest level of the hierarchical clustering.
The eigenvalues of the MDS transform show that information is well preserved:
ggplot(fit$eigrank, aes(x = rank, y = value ^ 2)) +
geom_point()
Distribution of daily compositional step sizes. The Maxwell-Boltzmann/lognormal shape to the density at \(\Delta(t) = 1\) suggests that under this time resolution the change in composition behaves like some kind of random walk, with a characteristic step size and rare changes of larger magnitude.
day.div <- day.div[, day.delta := as.numeric(day.j) - as.numeric(day.i)]
p1 <- ggplot(day.div[day.delta == 1,], aes(x = distance)) +
stat_bin(bins = 30)
p2 <- ggplot(day.div[day.delta == 1], aes(x = as.numeric(day.i), y = distance)) +
geom_point() +
# stat_smooth() +
labs(x = "day i")
plot_grid(p1, p2, nrow = 2, labels = "AUTO", align = "hv")
day.div[day.delta > 0] %>%
mutate(group = as.factor(floor(day.delta / 10) * 10),
ddelta = as.factor(day.delta %% 10)) %>%
ggplot(aes(x = distance)) +
stat_bin(bins = 30, position = "identity") +
scale_color_brewer(palette = "Spectral") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1,
size = 7
),
axis.text.y = element_text(size = 6)
) +
facet_grid(group ~ ddelta, scales = "free_y")
Compositional distance as function of interval. This is sort of an autocorrelation spectrum. There are no peaks in this ‘spectrum’ suggesting there are no characteristic time scales of periodic behavior. Since there is only one major state transition in these data, it could also be interpreted as the period of state transitions being longer than the observation time, when interpreted as periodic.
br <- days[seq(1, length(days), by = 5)]
ggplot(day.div, aes(x = day.i, y = day.j)) +
geom_tile(aes(fill = distance)) +
scale_x_discrete(breaks = br) +
scale_y_discrete(breaks = br) +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
tl <- day.div[day.delta > 0, .(mn.dist = mean(distance)), by = day.delta]
p0 <- ggplot(day.div[day.delta > 0], aes(x = day.delta, y = distance))
p1 <- p0 + geom_point(size = 1) +
# stat_smooth(aes(linetype = "mean")) +
geom_line(aes(y = mn.dist, linetype = "mean"), tl, color = "blue", size = 2) +
labs(linetype = "", x = "interval (days)")
p2 <- p0 + stat_bin_2d(binwidth = c(1, 0.01)) + labs(x = "interval (days)")
plot_grid(p1, p2, nrow = 2, align = "hv", labels = "AUTO")
david <- fread(paste0(data.dir, "david/david.otus"),
col.names = c("sample", "otu", "count")
)
# parse subject and timepoints
david.samples <- unique(david[, .(sample)])
david.samples[, c("subject", "day") := tstrsplit(sample, "_")]
# log transform otu counts
david[, log.count := log(count + 1)]
# split by subject
david.subjects <- unique(david.samples$subject)
names(david.subjects) <- david.subjects
# create tables of day-day divergences
david.div <- data.table(expand.grid(sample.i = david.samples$sample,
sample.j = david.samples$sample)
)
david.samples[, idx := 1:.N]
david.samples[, subj.idx := frank(as.numeric(day)), by = subject]
david.div <- merge(david.div, david.samples, by.x = "sample.i",
by.y = "sample")
david.div <- merge(david.div, david.samples, by.x = "sample.j",
by.y = "sample", suffixes = c(".i", ".j"))
david.div <- david.div[idx.j >= idx.i]
setkey(david, sample)
david.div[, jsd := {
if (sample.i == sample.j) {
0
} else {
sd <- david[c(sample.i, sample.j)]
m <- dcast(sd, otu ~ sample, value.var = "count")
m <- as.matrix(m[, 2:3])
m[is.na(m)] <- 0
genJSD(m)
}
}, by = .(sample.i, sample.j)]
# reciprocal distances
david.div <- david.div[sample.i != sample.j] %>%
setnames(names(david.div), sapply(names(david.div), function(x) {
if (grepl("idx\\.i", x)) {
gsub("idx\\.i", "idx\\.j", x)
} else if (grepl("idx\\.j", x)) {
gsub("idx\\.j", "idx\\.i", x)
} else if (grepl("\\.i", x)) {
gsub("\\.i", "\\.j", x)
} else if (grepl("\\.j", x)) {
gsub("\\.j", "\\.i", x)
} else x
})
) %>%
rbind(david.div)
david.div[, distance := sqrt(jsd)]
david.div[, day.delta := as.numeric(day.j) - as.numeric(day.i)]
david.div[subject.i == subject.j, increment := subj.idx.j - subj.idx.i]
Merge event metadata:
david <- fread(paste0(data.dir, "david/david.otus"),
col.names = c("sample", "otu", "count")
)
# parse subject and timepoints
david.samples <- unique(david[, .(sample)])
david.samples[, c("subject", "day") := tstrsplit(sample, "_")]
# log transform otu counts
david[, log.count := log(count + 1)]
# split by subject
david.subjects <- unique(david.samples$subject)
names(david.subjects) <- david.subjects
# create tables of day-day divergences
david.div <- data.table(expand.grid(sample.i = david.samples$sample,
sample.j = david.samples$sample)
)
david.samples[, idx := 1:.N]
david.samples[, subj.idx := frank(as.numeric(day)), by = subject]
david.div <- merge(david.div, david.samples, by.x = "sample.i",
by.y = "sample")
david.div <- merge(david.div, david.samples, by.x = "sample.j",
by.y = "sample", suffixes = c(".i", ".j"))
david.div <- david.div[idx.j >= idx.i]
setkey(david, sample)
david.div[, jsd := {
if (sample.i == sample.j) {
0
} else {
sd <- david[c(sample.i, sample.j)]
m <- dcast(sd, otu ~ sample, value.var = "count")
m <- as.matrix(m[, 2:3])
m[is.na(m)] <- 0
genJSD(m)
}
}, by = .(sample.i, sample.j)]
# reciprocal distances
david.div <- david.div[sample.i != sample.j] %>%
setnames(names(david.div), sapply(names(david.div), function(x) {
if (grepl("idx\\.i", x)) {
gsub("idx\\.i", "idx\\.j", x)
} else if (grepl("idx\\.j", x)) {
gsub("idx\\.j", "idx\\.i", x)
} else if (grepl("\\.i", x)) {
gsub("\\.i", "\\.j", x)
} else if (grepl("\\.j", x)) {
gsub("\\.j", "\\.i", x)
} else x
})
) %>%
rbind(david.div)
david.div[, distance := sqrt(jsd)]
david.div[, day.delta := as.numeric(day.j) - as.numeric(day.i)]
david.div[subject.i == subject.j, increment := subj.idx.j - subj.idx.i]
The magnitudes of day-to-day changes in composition don’t reflect significant events.
setkey(david.div, subject.i, subject.j)
event.lims <- events[, .(start = min(day), end = max(day)),
by = .(event, subject)]
event.lims <- event.lims[!grepl("pre", event) & !grepl("post", event)]
event.lims[, event := factor(levels = c("travel", "diarrhea 1", "diarrhea 2",
"Salmonella"), event)]
david.div[subject.i == subject.j & day.delta == 1] %>%
mutate(day.i = as.numeric(day.i)) %>%
setnames("subject.i", "subject") %>%
ggplot(aes(x = day.i, y = distance)) +
geom_rect(data = event.lims,
mapping = aes(fill = event, xmin = start, xmax = end,
ymin = 0.1, ymax = 1.05
), inherit.aes = FALSE, alpha = 0.5) +
geom_point() +
facet_wrap(~ subject, nrow = 2)
Distributions of all distances for all time scales. We see that both length distributions are multimodal, indicating at least 2 compositional “length scales.” We also see that the vast majority of A distances belong to the lowest mode, while the second lowest mode of B is as high as the lowest, indicating that subject B spends a significant amount of time (relative to the length of the entire measurement).
p0 <- ggplot(david.div[subject.i == subject.j], aes(x = distance))
ar <- 3 / 5
cdf <- p0 + stat_ecdf(aes(color = subject.i)) + labs(y = "ECDF") +
theme(aspect.ratio = ar)
dens <- p0 + stat_density(aes(fill = subject.i), alpha = 0.5,
position = "identity") +
theme(aspect.ratio = ar)
plot_grid(cdf, dens, nrow = 2, align = "v")
david.subjects <- c("A", "B")
names(david.subjects) <- david.subjects
setkey(david.div, subject.i, subject.j)
dm <- MakeDistMatrix(david.div, "sample.i", "sample.j")
fit <- CoordCMDS(dm)
xform <- fit$points
setcolorder(xform, c("sample", "x", "y"))
setkey(xform, sample)
# put sample labels at first 2 columns
setcolorder(david.div, c("sample.i", "sample.j",
names(david.div)[!(names(david.div) %in%
c("sample.i", "sample.j"))])
)
david.samples[, day := as.numeric(day)]
setcolorder(david.samples, c("sample", "subject", "day", "event", "idx",
"subj.idx"))
graf <- graph_from_data_frame(
david.div[subject.i == subject.j & increment == 1],
directed = TRUE,
vertices = david.samples
)
setkey(xform, sample)
xform <- xform[V(graf)$name]
lo <- create_layout(graf, "manual", node.positions = xform)
nsize <- 1
p1 <- ggraph(lo) +
geom_edge_link(arrow = arrow(length = unit(3, "points"), type = "closed"),
end_cap = square(length = 3, unit = "points"),
edge_width = 0.2) +
geom_node_point(aes(color = day), size = nsize) +
scale_color_distiller(palette = "Blues") +
facet_nodes(~ subject) +
theme_graph(base_family = "Helvetica") +
theme(aspect.ratio = 1)
p2 <- ggraph(lo) +
geom_edge_link(arrow = arrow(length = unit(3, "points"), type = "closed"),
end_cap = square(length = 3, unit = "points"),
edge_width = 0.2) +
geom_node_point(aes(color = event), size = nsize) +
facet_nodes(~ subject) +
theme_graph(base_family = "Helvetica") +
theme(aspect.ratio = 1)
plot_grid(p1, p2, nrow = 2, align = "v")
Laying both subjects along the same axes we see each subject is more similar to themselves than they are to the other subject, for the most part. The upper right represents states in which the two subjects resembled each other. Interestingly not all these points seem to be close in time. Subject B makes a brief excursion to a Subject A-like state post Salmonella.
Eigenvalues by rank:
ggplot(fit$eigrank, aes(x = rank, y = value ^ 2)) +
geom_point()
Hierarchical clustering reveals 3 major branches: characteristic of Subject A, characteristic of Subject B, and Shared Outlier State.
dm <- dcast(david.div, sample.i ~ sample.j, value.var = "distance")
rn <- dm$sample.i
dm <- dm[, -1]
dm <- as.matrix(dm)
rownames(dm) <- rn
dendro <- as.dendrogram(hclust(dist(dm)))
unique.day.events[, day := as.character(day)]
setkey(unique.day.events, subject, day)
david.samples[, day := as.character(day)]
david.samples[, state := {
subj <- subject
dd <- day
unique.day.events[.(subj, dd), event]
}, by = .(subject, day)]
setkey(david.samples, sample)
dendro <- dendrapply(dendro, function(d) {
if (is.leaf(d)) {
samp <- attr(d, "label")
subj <- david.samples[samp, subject]
state <- david.samples[samp, state]
attr(d, "nodePar") <- append(attr(d, "nodePar"), list(subject = subj,
state = state))
}
d
})
ggraph(dendro, "dendrogram") +
geom_edge_elbow() +
geom_node_point(aes(filter = leaf, shape = subject, color = state), size = 1) +
theme_graph(base_family = "Helvetica") +
# theme(aspect.ratio = 1) +
coord_flip()
Clustering separately shows that Subject B has distinct pre- and post-Salmonella states, but the various events in Subject A’s life didn’t create alternate stable states that were significantly different.
setkey(david.div, subject.i, subject.j)
dendros <- lapply(david.subjects, function(subj) {
sd <- david.div[.(subj, subj)]
dm <- dcast(sd, sample.i ~ sample.j, value.var = "distance")
rn <- dm$sample.i
dm <- dm[, -1]
dm <- as.matrix(dm)
rownames(dm) <- rn
hc <- hclust(dist(dm))
dendro <- as.dendrogram(hc)
return(dendro)
}
)
unique.day.events[, day := as.character(day)]
# setkey(events, subject)
setkey(unique.day.events, subject, day)
setkey(david.samples, sample)
dendros <- lapply(david.subjects, function(subj) {
dendro <- dendros[[subj]]
dendro <- dendrapply(dendro, function(d) {
if (is.leaf(d)) {
dday <- david.samples[attr(d, "label"), as.character(day)]
ev <- unique.day.events[.(subj, dday), event]
attr(d, "nodePar") <- append(attr(d, "nodePar"), list(
day = as.numeric(dday),
event = ev))
}
d
})
# propagate metadata back up the tree
dendro <- tree_apply(dendro, function(node, children, depth, tree) {
if (!is.leaf(node)) {
events <- sapply(children, function(c) {
attr(c, "nodePar")$event
})
events <- unique(events)
if (length(events) == 1 & !anyNA(events)) {
attr(node, "nodePar") <- append(attr(node, "nodePar"), list(event = events))
} else {
attr(node, "nodePar") <- append(attr(node, "nodePar"), list(event = NA))
}
}
node
}, direction = "up")
dendro
})
dendro.plots <- lapply(dendros, function(dend) {
ggraph(dend, "dendrogram", circular = TRUE) +
geom_edge_elbow(aes(color = node2.event)) +
geom_node_point(aes(filter = leaf, color = day), size = 0.6) +
scale_color_distiller(palette = "Spectral") +
theme_graph(base_family = "Helvetica") +
theme(aspect.ratio = 1)
})
plot_grid(plotlist = dendro.plots, nrow = 2, labels = "AUTO")
Subject A doesn’t seem to have a distinct “traveling” composition. Indeed it seems the distance between traveling and US compositions doesn’t seem noticeably larger than the distance between US compositions. Meaning traveling did not destabilize the microbiome any more than baseline fluctuations while living in the US. Perhaps this is due to many OTUs remaining stable through the duration of travel.
Subject B shows clear pre- and post-Salmonella states, as reported in the original manuscript. Perhaps the post-Salmonella state is separate because of the extinction of cluster 4 (see original paper) during infection, rendering return to the pre-Salmonella state impossible?
# metadata
setkey(david.samples, subject)
events <- mapply(function(start, end, subject, event) {
x <- data.table(day = seq(start, end, by = 1), subject = subject,
event = event)
x
}, start = c(0, 71, 80, 104, 123,
0, 151, 160 ),
end = c(70, 122, 85, 113, david.samples["A", max(as.numeric(day))],
150, 159, david.samples["B", max(as.numeric(day))]),
subject = c("A", "A", "A", "A", "A",
"B", "B", "B"),
event = c("US (pre)", "travel", "diarrhea 1", "diarrhea 2", "US (post)",
"pre-Salmonella", "Salmonella", "post-Salmonella"),
SIMPLIFY = FALSE) %>% rbindlist(use.names = TRUE)
# collapse event labels per day
unique.day.events <- events[, .(event = paste(event, collapse = " + ")),
by = .(subject, day)]
unique.day.events[, day := as.character(day)]
david.samples <- merge(david.samples, unique.day.events,
by = c("subject", "day"))
david.div <- merge(david.div, unique.day.events, by.x = c("subject.i", "day.i"),
by.y = c("subject", "day"))
david.div <- merge(david.div, unique.day.events, by.x = c("subject.j", "day.j"),
by.y = c("subject", "day"), suffixes = c(".i", ".j"))
David-style divergence matrices. Dark on-diagonal squares represent occupation of stable states. Most patients have distinct stable states.
setkey(gordon.div, subject.i, subject.j)
subjects <- sort(subjects)
names(subjects) <- subjects
gordon.div[, ":=" (hour.i = as.character(hour.i), hour.j = as.character(hour.j))]
plots <- lapply(subjects, function(subj) {
sd <- gordon.div[.(subj, subj)]
ts <- as.numeric(unique(sd$hour.i))
lims <- as.character(sort(ts))
p <- ggplot(sd, aes(x = hour.i, y = hour.j)) +
geom_tile(aes(fill = jsd)) +
scale_x_discrete(limits = lims) +
scale_y_discrete(limits = lims) +
theme(aspect.ratio = 1,
axis.text = element_blank(),
axis.ticks = element_blank())
})
plot_grid(plotlist = plots, ncol = 2, labels = names(plots), align = "hv")
Time-lag divergences:
gordon.div[, delta.t := as.numeric(hour.j) - as.numeric(hour.i)]
plots <- lapply(subjects, function(subj) {
sd <- gordon.div[.(subj, subj)]
sd <- sd[delta.t > 0]
p <- ggplot(sd, aes(x = delta.t, y = jsd)) +
geom_point(size = 0.1) +
geom_smooth(color = "blue") +
scale_x_log10() +
labs(x = "lag (hours)")
p
})
plot_grid(plotlist = plots, align = "hv", labels = names(plots),
nrow = 3)
`geom_smooth()` using method = 'gam'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
Distributions of divergences show that for all patients, diarrhea and recovery states are more similar to themselves than to each other (cross-state divergences are large). In most patients (except E) the recovery microbiome is actually less stable than the diarrhea microbiome: the diarrhea divergences tend to be smaller. However this could also be due to the denser temporal sampling during the diarrhea period. We should scale this by the time interval somehow, since it varies.
gordon.div[, state := {
if (state.i == state.j) state.i
else "cross"
}, by = .(state.i, state.j)]
p0 <- gordon.div[(subject.i == subject.j) & (delta.t > 0)] %>%
ggplot(aes(x = distance))
cdf <- p0 + stat_ecdf(aes(color = state)) + facet_wrap(~ subject.i, nrow = 1) +
labs(y = "ECDF") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
dens <- p0 + stat_density(aes(fill = state), alpha = 0.3,
position = "identity")
dens <- dens +
facet_wrap(~ subject.i, nrow = 1) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plot_grid(cdf, dens, nrow = 2, align = "v")
MDS of all subjects simultaneously shows that diarrhea/recovery separates time points more than subject.
setkey(gordon.div, subject.i, subject.j)
setkey(gordon.samples, subject)
edges <- lapply(subjects, function(subj) {
sd <- gordon.div[.(subj, subj)]
ts <- as.numeric(unique(sd$hour.i))
ts <- as.character(sort(ts))
ej <- data.table(ti = ts[-length(ts)], tj = ts[-1])
samps <- gordon.samples[subj, .(sample, hour)]
samps[, hour := as.character(hour)]
ej <- merge(ej, samps, by.x = "ti", by.y = "hour")
ej <- merge(ej, samps, by.x = "tj", by.y = "hour", suffixes = c(".i", ".j"))
ej[, .(sample.i, sample.j)]
}) %>% rbindlist()
graf <- graph_from_data_frame(edges,
directed = TRUE, vertices = gordon.samples)
dm <- MakeDistMatrix(gordon.div, "sample.i", "sample.j")
fit <- CoordCMDS(dm)
xform <- fit$points
# order
setkey(xform, sample)
xform <- xform[V(graf)$name]
lo <- create_layout(graf, "manual", node.positions = xform)
ggraph(lo) +
geom_edge_link(arrow = arrow(type = "closed", length = unit(3, "points")),
edge_width = 0.2, end_cap = square(length = 3,
unit = "points")) +
geom_node_point(aes(color = state)) +
# scale_shape_manual(values = seq(1, length(subjects))) +
facet_nodes(~ subject, nrow = 2) +
theme_graph(base_family = "Helvetica") +
theme(aspect.ratio = 1)
ggplot(fit$eigrank, aes(x = rank, y = value ^ 2)) +
geom_point()
All points plot show recovery and diarrhea cluster separately for all patients, with diarrhea having greater spread:
ggraph(lo) +
geom_node_point(aes(color = state, shape = subject)) +
scale_shape_manual(values = seq(uniqueN(gordon.samples$subject))) +
theme_graph(base_family = "Helvetica") +
theme(aspect.ratio = 1)
All together. Again separation is mostly diarrhea/recovery. However, some diarrhea points are closer to recovery points than they are to the majority of diarrhea points. This can also be seen in the last plot where the two ‘clouds’ enmesh.
ggraph(lo) +
geom_node_point(aes(color = state, shape = subject)) +
scale_shape_manual(values = seq(uniqueN(gordon.samples$subject))) +
theme_graph(base_family = "Helvetica") +
theme(aspect.ratio = 1)
setkey(gordon.div, subject.i, subject.j)
gordon.samples[, hour := as.character(hour)]
setkey(gordon.samples, subject, hour)
trees <- lapply(subjects, function(subj) {
sd <- gordon.div[.(subj, subj)]
sd <- dcast(sd, hour.i ~ hour.j, value.var = "distance")
rn <- sd[, hour.i]
dm <- as.matrix(sd[, -1])
rownames(dm) <- rn
dendro <- as.dendrogram(hclust(dist(dm)))
# merge time data and state
sd[, time.index := frank(as.numeric(hour.i))]
setkey(sd, hour.i)
dendro <- dendrapply(dendro, function(node) {
if (is.leaf(node)) {
labl <- attr(node, "label")
t <- as.numeric(labl)
time.index <- as.numeric(sd[labl, time.index])
state <- gordon.samples[.(subj, labl), state]
attr(node, "nodePar") <- append(attr(node, "nodePar"),
list(hour = t, time.index = time.index,
state = state)
)
}
node
})
# propagate state upwards through branch nodes
dendro <- tree_apply(dendro, function(node, tree, depth, children) {
if (!is.leaf(node)) {
child.states <- sapply(children, function(n) {
attr(n, "nodePar")$state
}) %>% unique()
if (length(child.states) == 1 & !anyNA(child.states)) {
state <- child.states
} else {
state <- NA
}
attr(node, "nodePar") <- append(attr(node, "nodePar"),
list(state = state))
}
node
}, direction = "up")
# browser()
p <- ggraph(dendro, "dendrogram", circular = TRUE) +
geom_edge_elbow(aes(color = node2.state)) +
geom_node_point(aes(filter = leaf, color = time.index)) +
scale_color_distiller(palette = "YlOrRd") +
theme_graph(base_family = "Helvetica") +
theme(aspect.ratio = 1, legend.key.height = unit(10, "points"))
})
plot_grid(plotlist = trees, ncol = 2, labels = names(trees))